knitr::opts_chunk$set(tidy=TRUE, highlight=TRUE, dev="png",
               cache=TRUE, highlight=TRUE, autodep=TRUE, warning=FALSE, error=FALSE,
               message=FALSE, prompt=TRUE, comment='', fig.cap='')
> library(ggplot2)
> library(dplyr)
> library(tidyr)
> library(biomaRt)
> library(pheatmap)

Overview

This is looking at a first sequencing run of single cells. There are three samples, H1, H2 and H3. There is some prior knowledge that the H1 sample might have some quality problems.

Barcode distributions

For a single-cell experiment, we expect to see a set of barcodes that come from real cells and then a drop. Below we show the barcode distribution for a published experiment on K562 cells and then each of these samples. We look at only the top 100,000 barcodes for each sample.

If you click on the tabs you can see each of the charts. We can see that the samples are missing the characteristic dip in the barcode distribution. Normally we would use this dip to determine a cutoff to filter out real cells.

This may be due to there not being enough depth of sequencing or it may be due to the samples having something wrong with them. It would be helpful to ask the sequencing core what they think about this.

> samples = c("H1", "H2", "H3")
> barcode_files = file.path("umis", samples, "cb-histogram.txt")

K562

> K562_bc = read.table("metadata/k562-histogram.txt", header = FALSE, sep = "\t", 
+     nrows = 1e+05)
> colnames(K562_bc) = c("barcode", "count")
> K562_bc$rank = as.numeric(rownames(K562_bc))
> ggplot(K562_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() + 
+     theme_bw() + ggtitle("K562 barcode count distribution")

K562 (60 million)

> K562_small = read.table("metadata/k562-histogram-60million.txt", header = FALSE, 
+     sep = "\t", nrows = 1e+05)
> colnames(K562_small) = c("barcode", "count")
> K562_small$rank = as.numeric(rownames(K562_small))
> ggplot(K562_small, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() + 
+     theme_bw() + ggtitle("K562 barcode count distribution")

H1

> h1_bc = read.table(barcode_files[1], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h1_bc) = c("barcode", "count")
> h1_bc$rank = as.numeric(rownames(h1_bc))
> ggplot(h1_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() + 
+     theme_bw() + ggtitle("H1 barcode count distribution")

H2

> h2_bc = read.table(barcode_files[2], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h2_bc) = c("barcode", "count")
> h2_bc$rank = as.numeric(rownames(h2_bc))
> ggplot(h2_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() + 
+     theme_bw() + ggtitle("H2 barcode count distribution")

H3

> h3_bc = read.table(barcode_files[3], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h3_bc) = c("barcode", "count")
> h3_bc$rank = as.numeric(rownames(h3_bc))
> ggplot(h3_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() + 
+     theme_bw() + ggtitle("H3 barcode count distribution")

Allon’s type barcode plots

This is an R port of Allon’s barcode plots. The matlab command:

[fLog xLog] = hist(log10(counts(counts>0)),50);
y = fLog.*(10.^xLog)/sum(fLog.*(10.^xLog));
plot(xLog,y*100,'linewidth',2)

hist from matlab returns a tuple of counts and centers. R returns a dataframe with mids and counts as columns. So, the R version of this plot is:

> allon_barcode_plot = function(bcs, sample) {
+     bcs_hist = hist(log10(bcs$count), plot = FALSE, n = 50)
+     fLog = bcs_hist$count
+     xLog = bcs_hist$mids
+     y = fLog * (10^xLog)/sum(fLog * (10^xLog))
+     print(qplot(xLog, y) + geom_point() + theme_bw() + ggtitle(sample))
+     return(data.frame(x = xLog, y = y, sample = sample))
+ }

This results in plots where it is easier to call a peak. It looks like we can set a rough cutoff of around 10^4 for the H1 and H3 samples. H2_REFSEQ doesn’t have a clear peak but 10^4 might be a reasonable cutoff.

K562

> k562_allon = allon_barcode_plot(K562_small, "K562")

H1

> h1_allon = allon_barcode_plot(h1_bc, "H1")

H2

> h2_allon = allon_barcode_plot(h2_bc, "H2")

H3

> h3_allon = allon_barcode_plot(h3_bc, "H3")

Combined

> vals = rbind(k562_allon, h1_allon, h2_allon, h3_allon)
> ggplot(vals, aes(x, y, color = sample)) + geom_point() + theme_bw()

Count based quality control

Here we compare the counts for the K562 data as well as for each of the proximal tubule samples, using several different metrics. We counted at the level of the transcript, so before continuing we will collapse the transcript level counts to gene level counts by summing the counts for each transcript for each gene.

> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", 
+     host = "jul2015.archive.ensembl.org")
> conversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id", 
+     "mgi_symbol"), mart = mart)
> load_counts = function(count_file) {
+     counts = read.table(count_file, header = TRUE, sep = ",")
+     counts = counts %>% tidyr::separate(gene, into = c("transcript", "version"), 
+         sep = "\\.", fill = "right") %>% dplyr::left_join(conversions, c(transcript = "ensembl_transcript_id")) %>% 
+         dplyr::filter(mgi_symbol != "") %>% dplyr::select(-transcript, -version, 
+         -ensembl_gene_id, -refseq_mrna) %>% tidyr::gather(cell, count, -mgi_symbol) %>% 
+         dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>% 
+         tidyr::spread(cell, counts, fill = 0)
+     counts = as.data.frame(counts)
+     rownames(counts) = counts$mgi_symbol
+     counts$mgi_symbol = NULL
+     return(counts)
+ }
> load_bcbio_refseq = function(count_file) {
+     counts = read.table(count_file, header = TRUE, sep = ",")
+     counts = counts %>% dplyr::left_join(conversions, c(gene = "refseq_mrna")) %>% 
+         dplyr::filter(mgi_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id, 
+         -ensembl_transcript_id) %>% tidyr::gather(cell, count, -mgi_symbol) %>% 
+         dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>% 
+         tidyr::spread(cell, counts, fill = 0)
+     counts = as.data.frame(counts)
+     rownames(counts) = counts$mgi_symbol
+     counts$mgi_symbol = NULL
+     return(counts)
+ }
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", 
+     host = "jul2015.archive.ensembl.org")
> hconversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id", 
+     "hgnc_symbol"), mart = mart)
> load_hcounts = function(count_file) {
+     counts = read.table(count_file, header = TRUE, sep = ",")
+     counts = counts %>% dplyr::left_join(hconversions, c(gene = "ensembl_transcript_id")) %>% 
+         dplyr::filter(hgnc_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id, 
+         -refseq_mrna) %>% tidyr::gather(cell, count, -hgnc_symbol, -hgnc_symbol) %>% 
+         dplyr::group_by(cell, hgnc_symbol) %>% dplyr::summarise(counts = sum(count)) %>% 
+         tidyr::spread(cell, counts, fill = 0)
+     counts = as.data.frame(counts)
+     rownames(counts) = counts$hgnc_symbol
+     counts$hgnc_symbol = NULL
+     return(counts)
+ }
> load_k562 = function(count_file) {
+     k562 = read.table(count_file, header = TRUE, sep = ",") %>% tidyr::separate(gene, 
+         into = c("transcript", "symbol"), sep = "\\|") %>% dplyr::select(-transcript) %>% 
+         tidyr::gather(cell, count, -symbol) %>% dplyr::group_by(cell, symbol) %>% 
+         dplyr::summarise(counts = sum(count)) %>% tidyr::spread(cell, counts, 
+         fill = 0)
+     k562 = as.data.frame(k562)
+     rownames(k562) = k562$symbol
+     k562$symbol = NULL
+     k562 = k562[!grepl("ERCC", rownames(k562)), ]
+     return(k562)
+ }
> count_files = file.path("no-ncrna", samples, paste(samples, ".counts", sep = ""))
> h1 = load_counts(count_files[1])
> h2 = load_counts(count_files[2])
> h3 = load_counts(count_files[3])
> count_files = file.path("new-counts", "umis", samples, paste(samples, ".counts", 
+     sep = ""))
> h1_refseq = load_bcbio_refseq(count_files[1])
> h2_refseq = load_bcbio_refseq(count_files[2])
> h3_refseq = load_bcbio_refseq(count_files[3])
> k562_grch37 = load_hcounts("metadata/k562-GRCh37.counts")
> k562_refseq = load_k562("metadata/k562-refseq.counts")

Based on Allon’s plots it looks like 10^4 is the cutoff for the H1-H3 samples. We already used that cutoff when we were deciding which cells to count so we won’t do any further filtering. We need to filter the K562 samples to pick out their peak, which is more like 10^5.

> keep_barcodes = function(barcodes, cutoff) {
+     bc = subset(barcodes, count > cutoff)$barcode
+     return(gsub("-", ".", bc))
+ }
> k562_keep = keep_barcodes(K562_bc, 10^5)
> k562_grch37 = k562_grch37[, k562_keep]
> k562_refseq = k562_refseq[, k562_keep]

Genes Detected

The H1-H3 samples have much less genes detected even when we drop the sequencing down of the K562 samples down to 60 million reads. The numbers of genes detected between the Ensembl and Refseq annotations look similar.

> k562_grch37_detected = data.frame(cell = colnames(k562_grch37), total = colSums(k562_grch37), 
+     detected = colSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(cell = colnames(k562_refseq), total = colSums(k562_refseq), 
+     detected = colSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(cell = colnames(h1), total = colSums(h1), detected = colSums(h1 > 
+     0), sample = "h1")
> h2_detected = data.frame(cell = colnames(h2), total = colSums(h2), detected = colSums(h2 > 
+     0), sample = "h2")
> h3_detected = data.frame(cell = colnames(h3), total = colSums(h3), detected = colSums(h3 > 
+     0), sample = "h3")
> h1_refseq_detected = data.frame(cell = colnames(h1_refseq), total = colSums(h1_refseq), 
+     detected = colSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(cell = colnames(h2_refseq), total = colSums(h2_refseq), 
+     detected = colSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(cell = colnames(h3_refseq), total = colSums(h3_refseq), 
+     detected = colSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_refseq_detected, h3_detected, 
+     k562_refseq_detected, h1_refseq_detected, h2_refseq_detected, h3_refseq_detected)
> detected = transform(detected, cell = reorder(cell, -detected))
> 
> ggplot(detected, aes(sample, detected)) + geom_boxplot() + theme_bw()

Looking at histograms and the boxplots, we see about the same number of genes detected irregardless of what annotation we use.

> ggplot(detected, aes(reorder(cell, detected), detected)) + geom_bar(stat = "identity", 
+     position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() + 
+     scale_x_discrete(breaks = NULL) + xlab("cell") + ylab("genes detected")

Total counts

For the K562 samples, the RefSeq annotation captures more counts per cell. This isn’t true of the H1-H3 samples, though.

> ggplot(detected, aes(sample, total)) + geom_boxplot() + theme_bw() + ylab("total counts per cell") + 
+     xlab("")

Complexity

We can also try to measure the complexity of the library, looking at how many different genes are detected plotted against the total number of counts. Here we can see that these libraries have a fairly poor distribution of counts when using the Ensembl annotation for H1-H3. Using the RefSeq annotation, it looks much better. The Ensembl annotation has a bunch of non-coding RNA and non-polyA RNA annotated, much more than the RefSeq annotation which could be leading to the particularly poor quality using the Ensembl annotation. As a comparison, the K562 sample, however, looks fine using either the Ensembl or the RefSeq annotation.

We can see from these plots a poor sequence quality in the libraries, the K562 sample has a fairly steep change in genes detected with an increased total counts. The H1-H3 samples on the other hand have a much less steep increase, a piece of evidence that these libraries are less complex than the K562 lbraries. We are comparing across species here, so the comparison isn’t a great one.

> ggplot(detected, aes(total, detected, color = sample)) + geom_point() + scale_x_log10() + 
+     facet_wrap(~sample) + theme_bw() + xlab("total counts") + ylab("genes detected")

Number of cells genes are expressed

Here we can see a steep dropoff for number of cells a gene is expressed in the H1-H3 samples, meaning there are fewer genes detected as expressed in every cell. The K562 samples, by comparison, have a much less sharp falloff. This indicates that there might be poor capture of RNA from the cells, leading to a less complex library.

> k562_grch37_detected = data.frame(gene = rownames(k562_grch37), total = rowSums(k562_grch37), 
+     detected = rowSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(gene = rownames(k562_refseq), total = rowSums(k562_refseq), 
+     detected = rowSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(gene = rownames(h1), total = rowSums(h1), detected = rowSums(h1 > 
+     0), sample = "h1")
> h2_detected = data.frame(gene = rownames(h2), total = rowSums(h2), detected = rowSums(h2 > 
+     0), sample = "h2")
> h3_detected = data.frame(gene = rownames(h3), total = rowSums(h3), detected = rowSums(h3 > 
+     0), sample = "h3")
> h1_refseq_detected = data.frame(gene = rownames(h1_refseq), total = rowSums(h1_refseq), 
+     detected = rowSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(gene = rownames(h2_refseq), total = rowSums(h2_refseq), 
+     detected = rowSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(gene = rownames(h3_refseq), total = rowSums(h3_refseq), 
+     detected = rowSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_refseq_detected, h3_detected, 
+     k562_refseq_detected, h1_refseq_detected, h2_refseq_detected, h3_refseq_detected)
> detected_df = detected
> detected = transform(detected, gene = reorder(gene, -detected))
> detected = subset(detected, total > 100)
> ggplot(detected, aes(reorder(gene, detected), detected)) + geom_bar(stat = "identity", 
+     position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() + 
+     scale_x_discrete(breaks = NULL) + xlab("genes") + ylab("cells detected")

The H1 samples look pretty bad– it is interesting to see that there are sets of genes which are detected as expressed using the Ensembl annotation but not the RefSeq annotation in the H samples.

Heatmaps of top 200 expressed genes

We can see the top 200 expressed genes have very variable expression across the cells. There are many cells where there is no expression; the K562 cells don’t have this issue. We can see there are probably cells in the H* samples that failed.

> top_200_heatmap = function(counts) {
+     counts = as.matrix(counts)
+     counts = head(counts[order(rowSums(counts), decreasing = TRUE), ], 200)
+     pheatmap(log(counts + 1), show_rownames = FALSE, show_colnames = FALSE)
+ }

K562 GRCh37

> top_200_heatmap(k562_grch37)

K562 RefSeq

> top_200_heatmap(k562_refseq)

H1

> top_200_heatmap(h1)

H2

> top_200_heatmap(h2)

H3

> top_200_heatmap(h3)

H1 RefSeq

> top_200_heatmap(h1_refseq)

H2 RefSeq

> top_200_heatmap(h2_refseq)

H3 RefSeq

> top_200_heatmap(h3_refseq)

Drop worst cells

We’ll drop the worst cells from the H* samples, where we define worst as the cells with < 500 genes detected. Using 1000 as the cutoff leaves us with tens of cells. This gets us down to hundreds.

> h1 = h1[, colSums(h1 > 0) > 500]
> h2 = h2[, colSums(h2 > 0) > 500]
> h3 = h3[, colSums(h3 > 0) > 500]
> h1_refseq = h1_refseq[, colSums(h1_refseq > 0) > 500]
> h2_refseq = h2_refseq[, colSums(h2_refseq > 0) > 500]
> h3_refseq = h3_refseq[, colSums(h3_refseq > 0) > 500]

Heatmaps of top 200 expressed genes after filtering

This definitely cleaned up the heatmaps, they look much nicer now. Still pretty spotty expression considering these are the top expressed genes, though.

H1

> top_200_heatmap(h1)

H2

> top_200_heatmap(h2)

H3

> top_200_heatmap(h3)

H1 RefSeq

> top_200_heatmap(h1_refseq)

H2 RefSeq

> top_200_heatmap(h2_refseq)

H3 RefSeq

> top_200_heatmap(h3_refseq)

Most variable genes

Here we take a shot at identifying the most variable genes to do PCA with, it is tough with the samples we have because they are very low counts. I dropped tfile-she cutoff to 1.5 so we could have enough variable genes to do jackStraw sampling later on.

K562 GRCh37

> library(Seurat)
> k562_grch37.dat = k562_grch37
> colnames(k562_grch37.dat) = paste(colnames(k562_grch37.dat), "k562_grch37", 
+     sep = "_")
> k562_grch37.seurat = new("seurat", raw.data = log(k562_grch37.dat + 1))
> k562_grch37.seurat = setup(k562_grch37.seurat, project = "k562_grch37", min.cells = 3, 
+     min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> k562_grch37.seurat = mean.var.plot(k562_grch37.seurat, y.cutoff = 2, do.plot = TRUE, 
+     x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H1

> h1.dat = h1
> colnames(h1.dat) = paste(colnames(h1.dat), "h1", sep = "_")
> h1.seurat = new("seurat", raw.data = log(h1.dat + 1))
> h1.seurat = setup(h1.seurat, project = "h1", min.cells = 3, min.genes = 500, 
+     is.expr = 0.01, names.field = 2, names.delim = "_")
> h1.seurat = mean.var.plot(h1.seurat, y.cutoff = 1.5, do.plot = TRUE, x.low.cutoff = 0.25, 
+     x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H2

> h2.dat = h2
> colnames(h2.dat) = paste(colnames(h2.dat), "h2", sep = "_")
> h2.seurat = new("seurat", raw.data = log(h2.dat + 1))
> h2.seurat = setup(h2.seurat, project = "h2", min.cells = 3, min.genes = 500, 
+     is.expr = 0.01, names.field = 2, names.delim = "_")
> h2.seurat = mean.var.plot(h2.seurat, y.cutoff = 1.5, do.plot = TRUE, x.low.cutoff = 0.25, 
+     x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H3

> h3.dat = h3
> colnames(h3.dat) = paste(colnames(h3.dat), "h3", sep = "_")
> h3.seurat = new("seurat", raw.data = log(h3.dat + 1))
> h3.seurat = setup(h3.seurat, project = "h3", min.cells = 3, min.genes = 500, 
+     is.expr = 0.01, names.field = 2, names.delim = "_")
> h3.seurat = mean.var.plot(h3.seurat, y.cutoff = 1.5, do.plot = TRUE, x.low.cutoff = 0.25, 
+     x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H1 Refseq

> h1_refseq.dat = h1_refseq
> colnames(h1_refseq.dat) = paste(colnames(h1_refseq.dat), "h1_refseq", sep = "_")
> h1_refseq.seurat = new("seurat", raw.data = log(h1_refseq.dat + 1))
> h1_refseq.seurat = setup(h1_refseq.seurat, project = "h1_refseq", min.cells = 3, 
+     min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h1_refseq.seurat = mean.var.plot(h1_refseq.seurat, y.cutoff = 1.5, do.plot = TRUE, 
+     x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H2 Refseq

> h2_refseq.dat = h2_refseq
> colnames(h2_refseq.dat) = paste(colnames(h2_refseq.dat), "h2_refseq", sep = "_")
> h2_refseq.seurat = new("seurat", raw.data = log(h2_refseq.dat + 1))
> h2_refseq.seurat = setup(h2_refseq.seurat, project = "h2_refseq", min.cells = 3, 
+     min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h2_refseq.seurat = mean.var.plot(h2_refseq.seurat, y.cutoff = 1.5, do.plot = TRUE, 
+     x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

H3 Refseq

> h3_refseq.dat = h3_refseq
> colnames(h3_refseq.dat) = paste(colnames(h3_refseq.dat), "h3_refseq", sep = "_")
> h3_refseq.seurat = new("seurat", raw.data = log(h3_refseq.dat + 1))
> h3_refseq.seurat = setup(h3_refseq.seurat, project = "h3_refseq", min.cells = 3, 
+     min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h3_refseq.seurat = mean.var.plot(h3_refseq.seurat, y.cutoff = 1.5, do.plot = TRUE, 
+     x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)

Most variable gene tables

K562 GRCh37

> k562_grch37.seurat@var.genes
  [1] "ABCE1"     "ABHD10"    "ACBD4"     "AFAP1-AS1" "AGA"      
  [6] "AKT1"      "ALS2"      "ANGEL2"    "AP3M1"     "APOE"     
 [11] "ARMC7"     "ASB7"      "ATG2B"     "ATG4B"     "ATXN1"    
 [16] "ATXN7L1"   "B3GNTL1"   "BBX"       "C10orf12"  "C18orf54" 
 [21] "C19orf12"  "C19orf48"  "C20orf203" "C2orf88"   "CCDC117"  
 [26] "CDADC1"    "CDK19"     "CDK5R1"    "CEP164"    "CETN2"    
 [31] "CETN3"     "CHTF18"    "COL1A2"    "COLGALT2"  "CREBL2"   
 [36] "CSTF1"     "DCUN1D4"   "DDIT3"     "DKC1"      "DNAJC27"  
 [41] "E2F1"      "EEA1"      "EEF2K"     "EFTUD1"    "EGLN3"    
 [46] "ELMO1"     "ERLIN1"    "EXTL2"     "FAM171A1"  "FAM222B"  
 [51] "FAM3C"     "FBXO11"    "FCGR2A"    "FER"       "FGD4"     
 [56] "FGFBP3"    "GEMIN5"    "GORASP2"   "GPR183"    "GPR63"    
 [61] "HBZ"       "HCCS"      "HDAC4"     "HDHD1"     "HIP1"     
 [66] "HIST1H1B"  "HMGB3"     "HOXB7"     "HOXC6"     "IGFBP4"   
 [71] "IP6K2"     "IQGAP2"    "ISPD-AS1"  "JMY"       "KIAA1715" 
 [76] "KLHL12"    "KLHL26"    "LYPLA1"    "MED21"     "MED9"     
 [81] "MIER1"     "MKLN1"     "MLEC"      "MORF4L2"   "MYADM"    
 [86] "NCK1"      "NEMF"      "NFIL3"     "NUDT16P1"  "NUP62"    
 [91] "ORMDL1"    "PAGE1"     "PGR"       "PHOSPHO2"  "PIM1"     
 [96] "PPP2R4"    "PRNP"      "PTK2B"     "PTPN7"     "PUF60"    
[101] "R3HCC1L"   "RAD23B"    "RBM17"     "RC3H1"     "SF1"      
[106] "SFMBT1"    "SGOL2"     "SGSH"      "SKAP2"     "SLC25A34" 
[111] "SLC29A3"   "SLC38A1"   "SLC50A1"   "SORCS1"    "SOS2"     
[116] "SSX2IP"    "STAT6"     "STK38"     "STT3A"     "SUV420H1" 
[121] "TAL2"      "TCF7L1"    "TMCC1-AS1" "TMEM14A"   "TMEM158"  
[126] "TMEM74B"   "TP73"      "TRAF3"     "TRIB3"     "TRIP12"   
[131] "TSPYL4"    "UBOX5"     "UGP2"      "USP2-AS1"  "USPL1"    
[136] "UTP14A"    "VPS36"     "WDFY2"     "WTAP"      "XYLB"     
[141] "ZBTB42"    "ZCCHC7"    "ZNF33A"    "ZNF33B"    "ZNF343"   
[146] "ZNF35"     "ZNF507"    "ZNF638"   

H1

> h1.seurat@var.genes
  [1] "6330416G13Rik" "Abl1"          "Actr1a"        "Actr1b"       
  [5] "Acy1"          "Add3"          "Adgrl2"        "Adi1"         
  [9] "Adipor2"       "Adtrp"         "Aebp2"         "Akr1e1"       
 [13] "Akt1"          "Amotl2"        "Ankzf1"        "Api5"         
 [17] "App"           "Arhgap15"      "Asb8"          "Ascc1"        
 [21] "Asxl1"         "Atf7"          "Atn1"          "Atp11b"       
 [25] "Atp6v1b2"      "Atpaf1"        "Bag2"          "Banf1"        
 [29] "Bcl11b"        "Bivm"          "Bmpr1a"        "Bmpr2"        
 [33] "Bms1"          "Btrc"          "Cadm1"         "Camta1"       
 [37] "Cant1"         "Canx"          "Caprin1"       "Cast"         
 [41] "Ccbl1"         "Ccdc127"       "Ccnk"          "Cd47"         
 [45] "Cdk13"         "Cdk19"         "Cdr2"          "Cebpb"        
 [49] "Celsr2"        "Cflar"         "Clcn4-2"       "Clip4"        
 [53] "Cnot6l"        "Cpq"           "Crtap"         "Ctns"         
 [57] "Cxxc1"         "Cyp4a12a"      "Cyp4a12b"      "D8Ertd82e"    
 [61] "Dapk1"         "Ddx58"         "Ddx6"          "Dennd6a"      
 [65] "Derl2"         "Dmtn"          "Dnajc9"        "Dpm2"         
 [69] "Dpp9"          "Dync1li2"      "Ecd"           "Efr3a"        
 [73] "Eif2ak1"       "Elp2"          "Elp3"          "Eml3"         
 [77] "Enpp1"         "Entpd6"        "Epdr1"         "Eps15"        
 [81] "F8a"           "Fam163a"       "Fam53c"        "Fbrsl1"       
 [85] "Fbxl20"        "Fchsd2"        "Fhod3"         "Fibp"         
 [89] "Fitm1"         "Fkbp1a"        "Fmo5"          "Fnbp1"        
 [93] "Frs3"          "Fundc2"        "Fzr1"          "G6pc3"        
 [97] "Gab1"          "Gas1"          "Gatad2b"       "Gatm"         
[101] "Gbas"          "Gbe1"          "Gbp7"          "Gid8"         
[105] "Gjb2"          "Gm14226"       "Gng12"         "Gnpda1"       
[109] "Gpm6a"         "Gstt3"         "Gtf2i"         "Gtpbp1"       
[113] "Hbp1"          "Hbs1l"         "Herc4"         "Hipk3"        
[117] "Hmgb3"         "Hnrnpc"        "Hoxb2"         "Hp1bp3"       
[121] "Hps5"          "Hsd17b7"       "Htatip2"       "Ilf3"         
[125] "Inpp5f"        "Irf2bp1"       "Jade3"         "Kat2b"        
[129] "Keap1"         "Kif1b"         "Klhl12"        "Kras"         
[133] "Krit1"         "Ldlr"          "Ldlrad3"       "Lhx1"         
[137] "Lin7c"         "Lpar3"         "Mad2l1bp"      "Map2k7"       
[141] "Map7"          "Mapt"          "Mavs"          "Max"          
[145] "Mea1"          "Mettl21b"      "Mfsd9"         "Mospd3"       
[149] "Mpped2"        "Mrrf"          "Myo1e"         "Ncor1"        
[153] "Neo1"          "Nolc1"         "Npepps"        "Nras"         
[157] "Nt5dc2"        "Nucb1"         "Odf2"          "Pacs2"        
[161] "Pan2"          "Papss1"        "Patl1"         "Pax8"         
[165] "Pcgf5"         "Pcna"          "Pex11b"        "Pex16"        
[169] "Phf12"         "Phf2"          "Pias4"         "Pla2g12a"     
[173] "Pld3"          "Polr2d"        "Ppp2r2d"       "Prpsap2"      
[177] "Psmg2"         "Ptpn1"         "Puf60"         "Qprt"         
[181] "Rab18"         "Rbbp5"         "Rbm33"         "Rbm4"         
[185] "Rbm4b"         "Rbmx"          "Rbpms"         "Reep3"        
[189] "Reep6"         "Repin1"        "Rexo2"         "Rfx1"         
[193] "Rnd2"          "Rnf152"        "Rsu1"          "Samd1"        
[197] "Scamp5"        "Sdf4"          "Sec23b"        "Sec31a"       
[201] "Serbp1"        "Sh3gl1"        "Sh3gl2"        "Shmt2"        
[205] "Slc22a8"       "Slc25a45"      "Snx13"         "Socs7"        
[209] "Sp1"           "Spin1"         "Sptb"          "Srek1ip1"     
[213] "Srsf7"         "Stard10"       "Stk24"         "Strn4"        
[217] "Stx12"         "Syne4"         "Sypl2"         "Taok2"        
[221] "Tapbp"         "Tbc1d5"        "Tbl2"          "Tcerg1"       
[225] "Tcf25"         "Terf2"         "Tfg"           "Tgfbr3"       
[229] "Them7"         "Tjp2"          "Tlcd1"         "Tmem106c"     
[233] "Tmem135"       "Tmem160"       "Tmem185b"      "Tnrc6b"       
[237] "Tomm34"        "Trappc1"       "Trappc9"       "Trip10"       
[241] "Tsc2"          "Tspan14"       "Tubgcp2"       "Txndc12"      
[245] "Txnrd3"        "Uba1"          "Uba3"          "Ube2w"        
[249] "Ubl4"          "Ucp2"          "Unc5c"         "Usp10"        
[253] "Usp47"         "Vps13d"        "Wbp5"          "Yeats4"       
[257] "Ywhaz"         "Zdhhc6"        "Zfp131"        "Zfp322a"      
[261] "Zfp839"        "Zfp963"        "Zmiz1"         "Zrsr2"        
[265] "Zswim8"        "Zwint"        

H2

> h2.seurat@var.genes
  [1] "Aadat"    "Abca1"    "Abl1"     "Actr1b"   "Add1"     "Akr1c18" 
  [7] "Akt1"     "Alg11"    "Anks1"    "Ano6"     "Anxa5"    "Ap3s1"   
 [13] "Apela"    "Api5"     "Aqp4"     "Arid2"    "Atf2"     "Atg2b"   
 [19] "Avl9"     "Banf1"    "Bcl11b"   "Bcl2"     "Bcl9l"    "Bivm"    
 [25] "Calcoco1" "Casp8"    "Cd36"     "Cdc27"    "Cdk10"    "Cdk19"   
 [31] "Cdkn1a"   "Cdkn2c"   "Cebpb"    "Celsr2"   "Cep164"   "Cep170b" 
 [37] "Chka"     "Clcn4-2"  "Clec2h"   "Cmah"     "Cnot1"    "Col27a1" 
 [43] "Copg1"    "Coro1c"   "Cpsf1"    "Cpsf2"    "Cryab"    "Csrnp3"  
 [49] "Cxxc1"    "Cyp2e1"   "Cyp4a12a" "Dapk1"    "Dcun1d3"  "Ddx58"   
 [55] "Ddx6"     "Deaf1"    "Dennd6a"  "Derl2"    "Dmtn"     "Dnajc5"  
 [61] "Dpep1"    "Dpp4"     "Dusp22"   "Dync1li2" "Efr3a"    "Eif1ad"  
 [67] "Eif2ak1"  "Eif4ebp2" "Elf2"     "Elp3"     "Ergic2"   "Etnppl"  
 [73] "Etv5"     "Fam114a2" "Fam179b"  "Fbxl20"   "Fchsd2"   "Fkbp1a"  
 [79] "Fnbp1"    "Fndc3b"   "Fundc2"   "Fzr1"     "G6pc3"    "Gab1"    
 [85] "Gas1"     "Gbe1"     "Gipc1"    "Glis1"    "Gmppa"    "Gng12"   
 [91] "Gpatch8"  "Gpm6a"    "Gsk3a"    "Gstz1"    "Gtpbp1"   "Hbs1l"   
 [97] "Hdc"      "Hmgb3"    "Hnrnpc"   "Hoxc10"   "Hp1bp3"   "Hsd17b14"
[103] "Hsd17b7"  "Igf2"     "Ilf3"     "Irf2bp1"  "Irf6"     "Itgb1"   
[109] "Itsn2"    "Jun"      "Kap"      "Kat6b"    "Kctd13"   "Keap1"   
[115] "Ldlr"     "Ldlrad3"  "Lias"     "Lin7c"    "Lrrc40"   "Lrrc8a"  
[121] "Lztfl1"   "Magt1"    "Map2k7"   "Map7"     "Mapt"     "Max"     
[127] "Mdm4"     "Mettl21b" "Mfsd9"    "Mmab"     "Mrrf"     "Myo1e"   
[133] "N4bp1"    "Napg"     "Napsa"    "Necap1"   "Nfyc"     "Nolc1"   
[139] "Nrep"     "Nsf"      "Nsun4"    "Nucb1"    "Nudcd1"   "Pabpn1"  
[145] "Pacs2"    "Pafah1b2" "Paip1"    "Pak1"     "Pan2"     "Paqr7"   
[151] "Patl1"    "Pbx1"     "Pcsk7"    "Pde6d"    "Pex11b"   "Pex16"   
[157] "Plbd2"    "Pls1"     "Pnisr"    "Polm"     "Pphln1"   "Ppm1h"   
[163] "Ppp2r2a"  "Ppp2r2d"  "Ppp3cb"   "Psat1"    "Ptbp3"    "Ptpdc1"  
[169] "Ptpn1"    "Pygo2"    "Qprt"     "R3hdm4"   "Rab12"    "Rabgap1" 
[175] "Rbbp5"    "Rbm4"     "Reep3"    "Rhot1"    "Rhot2"    "Rnf152"  
[181] "Rpl10"    "Rqcd1"    "Sae1"     "Scd1"     "Sdf4"     "Sec23b"  
[187] "Sept7"    "Serbp1"   "Setd6"    "Sfmbt1"   "Sfxn5"    "Sh3gl1"  
[193] "Shmt2"    "Slc12a2"  "Slc22a2"  "Slc22a7"  "Slc22a8"  "Slc25a45"
[199] "Slc35a5"  "Slc35e2"  "Slc39a9"  "Smarca2"  "Snx10"    "Snx13"   
[205] "Snx15"    "Socs7"    "Sp1"      "Sp2"      "Spats2"   "Spc25"   
[211] "Spidr"    "Spin1"    "Sptb"     "Srsf7"    "Stard5"   "Strn"    
[217] "Strn4"    "Stx16"    "Syde1"    "Taok2"    "Tbc1d10b" "Tcf25"   
[223] "Tfec"     "Tfg"      "Them7"    "Tjap1"    "Tmem161b" "Tmem41a" 
[229] "Tmem52b"  "Tnrc6b"   "Tom1"     "Tonsl"    "Tpm4"     "Traf6"   
[235] "Trappc1"  "Trip10"   "Trip6"    "Tsc2"     "Ttc9c"    "Ube2w"   
[241] "Ucp2"     "Ugt2b38"  "Ugt3a1"   "Unc5c"    "Usp1"     "Usp14"   
[247] "Usp47"    "Usp6nl"   "Utrn"     "Vasp"     "Vti1a"    "Wbp5"    
[253] "Wdtc1"    "Ypel2"    "Ywhaz"    "Zdhhc6"   "Zfp131"   "Zfp322a" 
[259] "Zfp618"   "Zfp839"   "Zscan22"  "Zwint"   

H3

> h3.seurat@var.genes
  [1] "2310022B05Rik" "9430016H08Rik" "Aadat"         "Abcb7"        
  [5] "Abhd10"        "Actr1b"        "Adipor2"       "Ago2"         
  [9] "Ahnak"         "Akt1"          "Aldh1a1"       "Ankib1"       
 [13] "Anxa4"         "Ap1m1"         "Api5"          "Arhgef7"      
 [17] "Arid2"         "Asb8"          "Atat1"         "Atn1"         
 [21] "AU040320"      "Banf1"         "BC005624"      "Bmp7"         
 [25] "Bscl2"         "Cast"          "Cdk13"         "Cdk19"        
 [29] "Cebpb"         "Celsr2"        "Cep97"         "Cflar"        
 [33] "Chic2"         "Cit"           "Clcn4-2"       "Clic5"        
 [37] "Clip4"         "Clock"         "Cmah"          "Cops2"        
 [41] "Cpeb3"         "Cpq"           "Cpsf1"         "Cpsf7"        
 [45] "Crtap"         "Cxxc1"         "Cyp20a1"       "Cyp4a31"      
 [49] "Dcaf7"         "Dctn1"         "Dennd6a"       "Derl2"        
 [53] "Dmtn"          "Dnm3"          "Dusp16"        "Dync2li1"     
 [57] "E430025E21Rik" "Efr3a"         "Egf"           "Eif1ad"       
 [61] "Elp2"          "Elp3"          "Eml3"          "Ephx2"        
 [65] "Epm2aip1"      "Erc1"          "Ercc3"         "Ero1l"        
 [69] "Etv5"          "Ezh1"          "Fam114a2"      "Fam210b"      
 [73] "Fam63a"        "Fbrsl1"        "Fign"          "Fkbp1a"       
 [77] "Gab1"          "Galns"         "Gapvd1"        "Gbas"         
 [81] "Gchfr"         "Gipc1"         "Gng12"         "Gnpat"        
 [85] "Gnpda1"        "Gpkow"         "Gpm6a"         "Gstm7"        
 [89] "Gstt3"         "H2-Ab1"        "Hbs1l"         "Hnrnpc"       
 [93] "Hp1bp3"        "Hps5"          "Hs1bp3"        "Ids"          
 [97] "Igf1r"         "Isoc1"         "Itm2c"         "Jund"         
[101] "Kap"           "Kat2b"         "Kat6a"         "Kdm4c"        
[105] "Kdm6b"         "Keap1"         "Klhl12"        "Krtcap2"      
[109] "Lims1"         "Lin7c"         "Lurap1l"       "Ly6a"         
[113] "Mad2l1bp"      "Map7"          "Mapt"          "Marveld2"     
[117] "Mavs"          "Max"           "Mcmbp"         "Mea1"         
[121] "Med16"         "Med17"         "Mfap3l"        "Mlxipl"       
[125] "Mmachc"        "Mmd"           "Mrrf"          "Mt1"          
[129] "Mtap"          "N4bp1"         "Naa35"         "Nfkb1"        
[133] "Nicn1"         "Nolc1"         "Nono"          "Nsun3"        
[137] "Nt5dc3"        "Nucb1"         "Nudt4"         "Osr2"         
[141] "Pacs2"         "Pak1ip1"       "Papd5"         "Patl1"        
[145] "Paxbp1"        "Pcmtd1"        "Pcnt"          "Pex14"        
[149] "Pex16"         "Pgm2"          "Pias4"         "Pld3"         
[153] "Pls1"          "Pnkd"          "Poglut1"       "Ppp1r1b"      
[157] "Ppp2r2d"       "Prkacb"        "Psat1"         "Ptpn1"        
[161] "Pygo2"         "Rab18"         "Rbbp5"         "Rbm15b"       
[165] "Rbm4"          "Rbpms"         "Reep6"         "Rexo2"        
[169] "Rnf152"        "Rnf166"        "Rsu1"          "Samd1"        
[173] "Sat2"          "Sdad1"         "Sec23b"        "Serbp1"       
[177] "Sgsm2"         "Sh3bgrl2"      "Shmt2"         "Slc12a1"      
[181] "Slc51a"        "Slc52a2"       "Snapin"        "Snx13"        
[185] "Sp1"           "Spin1"         "Srsf7"         "Stk24"        
[189] "Stx12"         "Syngr1"        "Sypl2"         "Tapt1"        
[193] "Tbc1d10b"      "Tbp"           "Tcf25"         "Tfcp2"        
[197] "Them7"         "Tmem52b"       "Tmem72"        "Tnfsf12"      
[201] "Tonsl"         "Tpgs2"         "Tpm4"          "Traf6"        
[205] "Trappc10"      "Tsc2"          "Ttr"           "Uba1"         
[209] "Ube2w"         "Ubl4"          "Ucp2"          "Umod"         
[213] "Upf3b"         "Uqcc1"         "Uso1"          "Usp1"         
[217] "Usp30"         "Usp47"         "Vps4a"         "Vti1a"        
[221] "Wfdc15b"       "Wwp1"          "Xpot"          "Zcchc3"       
[225] "Zdhhc6"        "Zfp655"        "Zfp787"        "Zfp810"       
[229] "Zfp963"        "Zfyve9"        "Zranb2"        "Zwint"        

H1 Refseq

> h1_refseq.seurat@var.genes
  [1] "6330416G13Rik" "Abhd5"         "Adi1"          "Adipor2"      
  [5] "Agtpbp1"       "Akt1"          "Ap2m1"         "Apex1"        
  [9] "Api5"          "Arhgef7"       "Arsb"          "Ascc1"        
 [13] "Atg13"         "Atn1"          "Atp11b"        "Atp6v1b2"     
 [17] "Atp6v1d"       "Atxn7l3b"      "Bag2"          "Bcl11b"       
 [21] "Bivm"          "Bmpr2"         "Bms1"          "Brpf3"        
 [25] "C130074G19Rik" "Cadm1"         "Capns1"        "Cdr2"         
 [29] "Celsr2"        "Cfl1"          "Ciapin1"       "Clcn6"        
 [33] "Clip1"         "Copg1"         "Cox8a"         "Crtap"        
 [37] "Ctnnd1"        "Ctns"          "Cux1"          "Cxxc1"        
 [41] "Cyp4a10"       "Cyp4a12a"      "Cyp4a12b"      "D8Ertd82e"    
 [45] "Dcaf13"        "Dctn1"         "Ddah1"         "Ddx58"        
 [49] "Dhx40"         "Dnajc9"        "Dpm2"          "Dpp9"         
 [53] "Dync1li2"      "Ecd"           "Efr3a"         "Eif2ak1"      
 [57] "Elp2"          "Emc1"          "Eml3"          "Entpd4"       
 [61] "Entpd6"        "Epdr1"         "Eps15"         "Ergic3"       
 [65] "Ezh1"          "Faah"          "Fam163a"       "Fam63a"       
 [69] "Fam89b"        "Fbrsl1"        "Fitm1"         "Fkbp4"        
 [73] "Fnta"          "Fundc2"        "Fzr1"          "G6pc3"        
 [77] "Gas1"          "Gatm"          "Gbe1"          "Gjb2"         
 [81] "Gle1"          "Gmppa"         "Gstt3"         "Gtpbp1"       
 [85] "Hadh"          "Hbp1"          "Hook3"         "Hp1bp3"       
 [89] "Hps5"          "Hsph1"         "Ifrd1"         "Igf1r"        
 [93] "Irf2bp1"       "Itgb1"         "Itm2c"         "Jade2"        
 [97] "Lage3"         "Lin7c"         "Lpar3"         "Lpar6"        
[101] "Lrrc61"        "Mafk"          "Mast4"         "Max"          
[105] "Minpp1"        "Mob3b"         "Mpped2"        "Myo1e"        
[109] "Npepps"        "Nras"          "Nt5dc2"        "Nub1"         
[113] "Odf2"          "Pak1"          "Pak1ip1"       "Paox"         
[117] "Papss1"        "Patl1"         "Pax2"          "Pax8"         
[121] "Pbx1"          "Pcgf5"         "Pdcl"          "Pde6d"        
[125] "Pdpk1"         "Pex11b"        "Pgm2"          "Pgpep1"       
[129] "Pgs1"          "Phf2"          "Phf3"          "Plekha6"      
[133] "Pls1"          "Pnkd"          "Ppme1"         "Ppp2r2d"      
[137] "Prkce"         "Prlr"          "Prr14l"        "Psenen"       
[141] "Psmg2"         "Ptms"          "Ptpn1"         "Rab11b"       
[145] "Rab4b"         "Rbbp5"         "Rbm4"          "Rbm4b"        
[149] "Rbmx"          "Rbpms"         "Rexo2"         "Rmnd5b"       
[153] "Rnf152"        "Rraga"         "Rsu1"          "Rwdd4a"       
[157] "Samd1"         "Scamp4"        "Sdr39u1"       "Sec23b"       
[161] "Sec31a"        "Slc12a2"       "Slc22a2"       "Slc25a45"     
[165] "Slc35b4"       "Snhg11"        "Snx13"         "Snx3"         
[169] "Socs7"         "Sod3"          "Sp1"           "Srd5a3"       
[173] "Stard5"        "Steap2"        "Stk24"         "Strn4"        
[177] "Stx12"         "Syne4"         "Tagln2"        "Taok2"        
[181] "Tbl2"          "Tecr"          "Tial1"         "Tmem135"      
[185] "Tmem160"       "Tnfrsf21"      "Tnks1bp1"      "Tomm34"       
[189] "Trappc1"       "Trappc9"       "Tti1"          "U2af2"        
[193] "Ubald1"        "Ucp2"          "Ufl1"          "Ugt3a1"       
[197] "Unc5c"         "Usp14"         "Usp46"         "Wbp5"         
[201] "Ywhae"         "Ywhag"         "Ywhaz"         "Zdhhc4"       
[205] "Zdhhc6"        "Zfp358"        "Zfp655"        "Zfx"          
[209] "Zmym4"         "Zwint"        

H2 Refseq

> h2_refseq.seurat@var.genes
  [1] "1600014C10Rik" "Aadat"         "Abca1"         "Abhd4"        
  [5] "Acot7"         "Add1"          "Adi1"          "Akr1c18"      
  [9] "Akt1"          "Ankrd16"       "Anks1"         "Anxa5"        
 [13] "Apela"         "Apex1"         "Apex2"         "Arid2"        
 [17] "Arnt2"         "AW549877"      "Bcl9l"         "Bivm"         
 [21] "Braf"          "Calcoco1"      "Capns1"        "Casp8"        
 [25] "Ccdc9"         "Cdkn2c"        "Clec2h"        "Cnot1"        
 [29] "Coa5"          "Col27a1"       "Copg1"         "Crtap"        
 [33] "Cstf2t"        "Ctnnd1"        "Cxxc1"         "Cyp2e1"       
 [37] "Cyp4a12a"      "Cyp51"         "Dapk1"         "Dcaf7"        
 [41] "Dctn1"         "Dcun1d3"       "Ddah1"         "Ddx58"        
 [45] "Dnajc10"       "Dpm2"          "Efhd1"         "Efr3a"        
 [49] "Eif2ak1"       "Eif4ebp2"      "Elp2"          "Erbb4"        
 [53] "Ergic3"        "Evc2"          "Fbxl20"        "Fchsd2"       
 [57] "Fitm1"         "Fkbp15"        "Fkbp4"         "Fndc3b"       
 [61] "Fundc2"        "Fzr1"          "G6pc3"         "Gas1"         
 [65] "Gbas"          "Gbe1"          "Glis1"         "Gltp"         
 [69] "Gmppa"         "Got1"          "Gpatch8"       "Gps2"         
 [73] "Gstm1"         "Gtpbp1"        "Hdc"           "Hmgcr"        
 [77] "Hoxc10"        "Hp1bp3"        "Hpcal1"        "Hsd17b14"     
 [81] "Irf2bp1"       "Irf6"          "Itgb1"         "Jun"          
 [85] "Kat6a"         "Kctd13"        "Khsrp"         "Klf3"         
 [89] "Lpl"           "Lrrc8a"        "Magt1"         "Manf"         
 [93] "Max"           "Med20"         "Mmab"          "Mrrf"         
 [97] "Napg"          "Napsa"         "Necap1"        "Nfkb1"        
[101] "Npr2"          "Nras"          "Nsf"           "Nsun4"        
[105] "Olfml1"        "Pak1"          "Pak1ip1"       "Patl1"        
[109] "Pbx1"          "Pcna"          "Peli1"         "Pgm2"         
[113] "Pgpep1"        "Plbd2"         "Pls1"          "Pnkd"         
[117] "Polm"          "Ppargc1b"      "Ppic"          "Ppm1h"        
[121] "Ppp2r2d"       "Psat1"         "Ptms"          "Ptpn1"        
[125] "R3hdm4"        "Rab12"         "Rab4b"         "Rbbp5"        
[129] "Rexo2"         "Rmnd5b"        "Rnf152"        "Rqcd1"        
[133] "Rraga"         "Rsu1"          "Scd1"          "Sec31a"       
[137] "Sepp1"         "Setd6"         "Slc12a2"       "Slc22a2"      
[141] "Slc25a45"      "Slc35a3"       "Slc35a5"       "Slc39a9"      
[145] "Snx13"         "Snx15"         "Snx3"          "Sod3"         
[149] "Sp1"           "Spidr"         "Srsf2"         "Stard3nl"     
[153] "Stard5"        "Stk24"         "Taok2"         "Tbc1d10b"     
[157] "Tbcel"         "Tbp"           "Tfec"          "Tm9sf1"       
[161] "Tmem52b"       "Tnks1bp1"      "Top2b"         "Trip6"        
[165] "Ttc9c"         "Ttr"           "Txndc12"       "U2af2"        
[169] "Ubap2"         "Ube4b"         "Ubr4"          "Ucp2"         
[173] "Ugt2b38"       "Usp14"         "Wbp5"          "Wdtc1"        
[177] "Wipf2"         "Ypel2"         "Zdhhc6"        "Zwint"        

H3 Refseq

> h3_refseq.seurat@var.genes
  [1] "1810013L24Rik" "2310022B05Rik" "Aadat"         "Abcb7"        
  [5] "Abhd4"         "Acat2"         "Acbd3"         "Akap13"       
  [9] "Akt1"          "Aldh1a1"       "Anxa4"         "Ap1m1"        
 [13] "Ap2m1"         "Apex1"         "Arhgef7"       "Arid1b"       
 [17] "Arid2"         "Arsb"          "Ascc1"         "Atat1"        
 [21] "Atn1"          "Atp5s"         "Atxn1l"        "Bace1"        
 [25] "BC005624"      "Bmp7"          "Braf"          "C130074G19Rik"
 [29] "Camsap1"       "Capns1"        "Car12"         "Casc3"        
 [33] "Casp8"         "Cbx6"          "Ccdc9"         "Ccnk"         
 [37] "Cd164"         "Cd47"          "Cdc42bpg"      "Cdk5rap3"     
 [41] "Chic1"         "Cldn19"        "Clec2d"        "Clptm1"       
 [45] "Cnot11"        "Cops2"         "Cpeb3"         "Cpt1a"        
 [49] "Crtap"         "Crtc2"         "Ctdnep1"       "Cxcl12"       
 [53] "Cxxc1"         "Cyp4a10"       "Cyp4a31"       "Dcaf7"        
 [57] "Dctn1"         "Dennd4c"       "Derl2"         "Dnase1"       
 [61] "Dnph1"         "Dusp1"         "Dusp22"        "Dync2h1"      
 [65] "Dync2li1"      "Efr3a"         "Egf"           "Eif1ad"       
 [69] "Elp2"          "Emc1"          "Eml3"          "Epm2aip1"     
 [73] "Etv5"          "Exoc2"         "Exoc8"         "Ezh1"         
 [77] "Fabp3"         "Fam175b"       "Fam210b"       "Fam222b"      
 [81] "Fam63a"        "Fbrsl1"        "Fbxo28"        "Fez2"         
 [85] "Fitm1"         "Fkbp15"        "Fndc3b"        "G6pc3"        
 [89] "Gas1"          "Gbas"          "Gbe1"          "Gmppa"        
 [93] "Golga7"        "Gpaa1"         "Gstt3"         "Gtf2a1"       
 [97] "Gtf3c1"        "Hbp1"          "Hcfc1"         "Hdac4"        
[101] "Hdgf"          "Hnrnpdl"       "Hoxb7"         "Hoxb9"        
[105] "Hoxc10"        "Hs1bp3"        "Hspa1b"        "Ido2"         
[109] "Ift52"         "Igf1r"         "Ints5"         "Iqgap1"       
[113] "Itgav"         "Itm2c"         "Kap"           "Kat6a"        
[117] "Kdm4c"         "Kdm6b"         "Kdsr"          "Kif1b"        
[121] "Klhl2"         "Klhl42"        "Kmt2b"         "Krba1"        
[125] "Krcc1"         "Lin7c"         "Lurap1l"       "Mad2l1bp"     
[129] "Map1lc3b"      "Map7"          "Mavs"          "Max"          
[133] "Mcmbp"         "Med19"         "Med20"         "Med30"        
[137] "Mfsd9"         "Mmachc"        "Mpv17l"        "Mt1"          
[141] "Mtap"          "N4bp1"         "Naa15"         "Napsa"        
[145] "Nck1"          "Nono"          "Npr2"          "Nt5dc3"       
[149] "Nub1"          "Nudt16"        "Nudt4"         "Osr2"         
[153] "Pak1ip1"       "Patl1"         "Pck1"          "Pde6d"        
[157] "Pdhx"          "Pdrg1"         "Pex14"         "Pgm2"         
[161] "Pgpep1"        "Plrg1"         "Pls1"          "Pnkd"         
[165] "Ppargc1b"      "Ppm1a"         "Ppp2r2d"       "Prkacb"       
[169] "Psat1"         "Psenen"        "Ptms"          "Ptpn1"        
[173] "R3hdm4"        "Rab11b"        "Rab4b"         "Rbbp5"        
[177] "Rbm4"          "Reep6"         "Rexo1"         "Rexo2"        
[181] "Rnf152"        "Rnf166"        "Rsu1"          "Scamp4"       
[185] "Sdad1"         "Sdha"          "Sema5a"        "Sepp1"        
[189] "Serpina1f"     "Serpinf2"      "Setd5"         "Sf3b4"        
[193] "Slc12a1"       "Slc22a19"      "Slc22a2"       "Slc35a2"      
[197] "Slc51a"        "Slc7a8"        "Smc3"          "Smim19"       
[201] "Snapin"        "Snx13"         "Sp1"           "Spty2d1"      
[205] "Srsf10"        "Ssr1"          "St8sia1"       "Stk24"        
[209] "Stx12"         "Syngr1"        "Tapt1"         "Tbc1d10b"     
[213] "Tbcel"         "Tbp"           "Tm9sf1"        "Tm9sf2"       
[217] "Tm9sf4"        "Tmem186"       "Tmem203"       "Tmem258"      
[221] "Tmem52b"       "Tnfrsf21"      "Tnks1bp1"      "Tpm4"         
[225] "Tpst2"         "Trappc12"      "Trappc9"       "Trim27"       
[229] "Tspan33"       "Tspan9"        "U2af2"         "Ubr4"         
[233] "Ucp2"          "Umod"          "Upf3b"         "Usp5"         
[237] "Vps4a"         "Wee1"          "Wfdc15b"       "Xpot"         
[241] "Yap1"          "Ypel2"         "Ywhae"         "Zcchc3"       
[245] "Zdhhc6"        "Zfp191"        "Zfp655"        "Zfp963"       
[249] "Zranb2"        "Zwint"         "Zyg11b"       

PCA

Just to see, even though the variable genes identified looks wonky, in these samples, we ran PCA on all of the samples to see if any groups pop out.

K562

> k562_grch37.seurat = pca(k562_grch37.seurat, pcs.print = 3, genes.print = 5, 
+     do.print = FALSE)
> pca.plot(k562_grch37.seurat, pt.size = 2)

H1

> h1.seurat = pca(h1.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1.seurat, pt.size = 2)

H2

> h2.seurat = pca(h2.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2.seurat, pt.size = 2)

H3

> h3.seurat = pca(h3.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3.seurat, pt.size = 2)

H1 Refseq

> h1_refseq.seurat = pca(h1_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1_refseq.seurat, pt.size = 2)

H2 Refseq

> h2_refseq.seurat = pca(h2_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2_refseq.seurat, pt.size = 2)

H3 Refseq

> h3_refseq.seurat = pca(h3_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3_refseq.seurat, pt.size = 2)

Housekeeping genes

I pulled a set of housekeeping genes from Evidence Based Selection of Housekeeping Genes and converted them to human and mouse symbols. Then I made heatmaps for all of the housekeeping genes. We can see that for the K562 sample, most of the housekeeping genes are expressed strongly whereas in the H1-H3 samples, there is spare expression of the housekeeping genes.

> hk = as.character(read.table("metadata/hk.txt", header = FALSE)[, "V1"])
> hk = c(hk, c("Actb", "Gapdh"))
> human_hk = as.character(read.table("metadata/human-hk.txt", header = FALSE)[, 
+     "V1"])
> human_hk = c(human_hk, c("Actb", "Gapdh"))
> housekeeping_heatmap = function(counts, hk) {
+     counts = counts[rownames(counts) %in% hk, ]
+     pheatmap(log(counts + 1), show_rownames = TRUE, show_colnames = FALSE)
+ }

K562 GRCh37

> housekeeping_heatmap(k562_grch37, human_hk)

K562 RefSeq

> housekeeping_heatmap(k562_refseq, human_hk)

H1

> housekeeping_heatmap(h1, hk)

H2

> housekeeping_heatmap(h2, hk)

H3

> housekeeping_heatmap(h3, hk)

H1 RefSeq

> housekeeping_heatmap(h1_refseq, hk)

H2 RefSeq

> housekeeping_heatmap(h2_refseq, hk)

H3 RefSeq

> housekeeping_heatmap(h3_refseq, hk)

Summary

These libraries might not be of good enough quality to do anything with. This paper came out in late April and did a nice job breaking down what we can expect to be able to pull out from very shallow single-cell sequencing data. We have on the lower end of what they looked at for many of the cells and have less than what they needed to be able to pull out differences between different cell types of neurons. Here we are looking at two of the same cell type and are looking for what are likely more subtle differences.

Transcriptional programs

We’ll do GO ontology of the top 250 highest weighted genes for the first principal components and compare the results across the H1-H3 samples. If we’re seeing similar genes in each sample, that will give us some evidence that we’re seeing a robust signal at the pathway level.

To do this we pulled out the loadings for the genes along component one from the PCA, and then used those genes to do pathway enrichment analysis using clusterProfiler:

G Yu, LG Wang, Y Han, QY He.
clusterProfiler: an R package for comparing biological themes among gene clusters.
OMICS: A Journal of Integrative Biology 2012, 16(5):284-287.
> library(clusterProfiler)
> library(org.Mm.eg.db)
> seurat_clusterprofiler = function(seurat_obj, component, samplename) {
+     universe = rownames(seurat_obj@data)
+     seurat_obj = project.pca(seurat_obj, do.print = FALSE)
+     genes = seurat_obj@pca.x.full[, component, drop = FALSE]
+     genes = rownames(genes)[order(abs(genes[, 1]), decreasing = TRUE)][1:250]
+     genes.df = bitr(genes, "SYMBOL", c("ENSEMBL", "ENTREZID"), "org.Mm.eg.db")
+     universe.df = bitr(universe, "SYMBOL", c("ENSEMBL", "ENTREZID"), "org.Mm.eg.db")
+     mf = summary(enrichGO(genes.df$ENTREZID, universe = universe.df$ENTREZID, 
+         OrgDb = org.Mm.eg.db, ont = "MF", pAdjustMethod = "BH", qvalueCutoff = 1, 
+         pvalueCutoff = 1))
+     mf$ont = "MF"
+     cc = summary(enrichGO(genes.df$ENTREZID, universe = universe.df$ENTREZID, 
+         OrgDb = org.Mm.eg.db, ont = "CC", pAdjustMethod = "BH", qvalueCutoff = 1, 
+         pvalueCutoff = 1))
+     cc$ont = "CC"
+     bp = summary(enrichGO(genes.df$ENTREZID, universe = universe.df$ENTREZID, 
+         OrgDb = org.Mm.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 1, 
+         pvalueCutoff = 1))
+     bp$ont = "BP"
+     combined = rbind(cc, mf, bp)
+     combined$samplename = samplename
+     return(combined)
+ }
> 
> seurat_clusterprofiler_gsea = function(seurat_obj, component, samplename) {
+     universe = rownames(seurat_obj@data)
+     seurat_obj = project.pca(seurat_obj, do.print = FALSE)
+     genes = seurat_obj@pca.x.full[, component, drop = FALSE]
+     colnames(genes) = c("PC")
+     gene_symbols = rownames(genes)[order(abs(genes[, 1]), decreasing = TRUE)]
+     genes$SYMBOL = rownames(genes)
+     genes.df = bitr(gene_symbols, "SYMBOL", c("ENSEMBL", "ENTREZID"), "org.Mm.eg.db")
+     genes.df = genes.df %>% left_join(genes, by = "SYMBOL")
+     universe.df = bitr(universe, "SYMBOL", c("ENSEMBL", "ENTREZID"), "org.Mm.eg.db")
+     genes = genes.df[, c("ENTREZID", "PC")] %>% unique()
+     gene_pc = genes[, "PC"]
+     names(gene_pc) = genes$ENTREZID
+     gene_pc = gene_pc[order(gene_pc, decreasing = TRUE)]
+     cc = summary(gseGO(gene_pc, ont = "CC", OrgDb = org.Mm.eg.db, nPerm = 500, 
+         minGSSize = 100, pvalueCutoff = 1, pAdjustMethod = "BH", verbose = TRUE))
+     cc$ont = "CC"
+     mf = summary(gseGO(gene_pc, ont = "MF", OrgDb = org.Mm.eg.db, nPerm = 500, 
+         minGSSize = 100, pvalueCutoff = 1, pAdjustMethod = "BH", verbose = TRUE))
+     mf$ont = "MF"
+     bp = summary(gseGO(gene_pc, ont = "bp", OrgDb = org.Mm.eg.db, nPerm = 500, 
+         minGSSize = 100, pvalueCutoff = 1, pAdjustMethod = "BH", verbose = TRUE))
+     bp$ont = "BP"
+     combined = rbind(cc, mf, bp)
+     combined$samplename = samplename
+     return(combined)
+ }
> h1_cp = seurat_clusterprofiler(h1.seurat, 1, "H1")
> h2_cp = seurat_clusterprofiler(h2.seurat, 1, "H2")
> h3_cp = seurat_clusterprofiler(h3.seurat, 1, "H3")
> h1_refseq_cp = seurat_clusterprofiler(h1_refseq.seurat, 1, "H1_refseq")
> h2_refseq_cp = seurat_clusterprofiler(h2_refseq.seurat, 1, "H2_refseq")
> h3_refseq_cp = seurat_clusterprofiler(h3_refseq.seurat, 1, "H3_refseq")
> combinedgo = rbind(h1_cp, h2_cp, h3_cp, h1_refseq_cp, h2_refseq_cp, h3_refseq_cp)

Only the H2 sample has significant hits at the pathway level for the first principal component so we can’t do much in terms of overlapping the calls between the samples.

> siggo = combinedgo %>% dplyr::filter(qvalue < 0.1) %>% group_by(ont, samplename) %>% 
+     summarise(sigpaths = n())
> ggplot(siggo, aes(samplename, sigpaths)) + geom_bar(stat = "identity", position = "dodge") + 
+     facet_wrap(~ont) + theme_bw() + xlab("") + ylab("significant ontology terms")

Taking the top 250 genes in the PCA is pretty arbitrary. Instead we’ll try doing GSEA, which looks for sets of genes that are moving in the same direction, here direction is the weight along the first principal component.

> h1_cp_gsea = seurat_clusterprofiler_gsea(h1.seurat, 1, "H1")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> h2_cp_gsea = seurat_clusterprofiler_gsea(h2.seurat, 1, "H2")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> h3_cp_gsea = seurat_clusterprofiler_gsea(h3.seurat, 1, "H3")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> h1_refseq_cp_gsea = seurat_clusterprofiler_gsea(h1_refseq.seurat, 1, "H1_refseq")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> h2_refseq_cp_gsea = seurat_clusterprofiler_gsea(h2_refseq.seurat, 1, "H2_refseq")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> h3_refseq_cp_gsea = seurat_clusterprofiler_gsea(h3_refseq.seurat, 1, "H3_refseq")
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
[1] "calculating observed enrichment scores..."
[1] "calculating permutation scores..."

  |                                                                       
  |                                                                 |   0%
[1] "calculating p values..."
[1] "done..."
> combinedgsea = rbind(h1_cp_gsea, h2_cp_gsea, h3_cp_gsea, h1_refseq_cp_gsea, 
+     h2_refseq_cp_gsea, h3_refseq_cp_gsea)

GSEA pulls out some significant terms in the H2 and H3 samples:

> siggo = combinedgsea %>% dplyr::filter(qvalues < 0.1) %>% group_by(ont, samplename) %>% 
+     summarise(sigpaths = n())
> ggplot(siggo, aes(samplename, sigpaths)) + geom_bar(stat = "identity", position = "dodge") + 
+     facet_wrap(~ont) + theme_bw() + xlab("") + ylab("significant ontology terms")

The overlap between these ontology terms doesn’t look great, however:

> library(UpSetR)
> sigsets = combinedgsea %>% dplyr::filter(qvalues < 0.1) %>% dplyr::select(ID, 
+     Description, samplename) %>% dplyr::group_by(Description, samplename)
> sigsets$value = 1
> sigsets = sigsets %>% tidyr::spread(samplename, value, fill = 0)
> ss = data.frame(sigsets)
> upset(ss)

I think we can’t do too much with this dataset as it is, the cells are not different enough to pick out anything with the quality problems we’re having.